import csv
import math
import random
from operator import itemgetter

import chardet


class ItemBasedCF():
    """Item-based collaborative filtering recommender for movie ratings.

    Typical usage is ``get_dataset()`` -> ``calc_movie_sim()`` ->
    ``evaluate()`` (or ``recommend(user)`` for a single user).

    User ids, movie ids and ratings are kept as the strings read from the
    ratings file; ratings are converted with ``float`` only when scoring.
    """

    def __init__(self):
        """Set the default hyper-parameters and create empty data containers."""
        # For every watched movie consider its 20 most similar movies, and
        # return the 5 best-scoring candidates to the target user.
        self.n_sim_movie = 20
        self.n_rec_movie = 5
        # {user: {movie: rating}} for the training and test splits.
        self.trainSet = {}
        self.testSet = {}
        # Movie-to-movie similarity matrix: {movie: {other_movie: weight}}.
        self.movie_sim_matrix = {}
        # Number of training users that rated each movie.
        self.movie_popular = {}
        # Number of distinct movies in the training set.
        self.movie_count = 0
        print('Similar movie number = %d' % self.n_sim_movie)
        print('Recommneded movie number = %d' % self.n_rec_movie)

    def user_add_dataset(self):
        """Interactively read ratings for the built-in user ``"0"``.

        Prompts for the number of movies, then for a movie id and a rating
        that many times.

        Returns:
            A ``(user, ratings)`` tuple where ``ratings`` maps movie id to the
            rating string that was typed in.

        Raises:
            ValueError: if the movie count entered is not an integer.
        """
        user = "0"
        # user = input('Please put your user id there:')
        movie_count = int(input('Please enter the movie_count there:'))
        # Only movie -> rating pairs are stored; the result is meant to be
        # merged into one user's rating dict, so it must not contain the
        # user id itself as a key.
        add_dataset = {}
        for _ in range(movie_count):
            movie = input('Put your watched movie_id there:')
            rating = input('Put your rating:')
            add_dataset[movie] = rating
        return user, add_dataset

    def get_dataset(self, filename='recommend/ratings_sorted.csv', pivot=0.875):
        """Load the ratings file and randomly split it into train/test sets.

        Each data line must have the form ``user,movie,rating``. A line goes
        to the training set when ``random.random() < pivot`` and to the test
        set otherwise. Blank lines are skipped.

        Args:
            filename: path of the ratings CSV file (its first line is skipped
                as a header by ``load_file``).
            pivot: probability that a rating is assigned to the training set.

        Raises:
            ValueError: if a non-blank line does not have exactly three
                comma-separated fields.
        """
        trainSet_len = 0
        testSet_len = 0
        for line in self.load_file(filename):
            if not line.strip():
                continue
            user, movie, rating = line.split(',')
            # Nested dict layout, e.g.
            # {'1': {'1287': '2.0', '1953': '4.0'}, '2': {'10': '4.0'}}
            # means user 1 rated movie 1287 with 2.0, and so on.
            if random.random() < pivot:
                self.trainSet.setdefault(user, {})[movie] = rating
                trainSet_len += 1
            else:
                self.testSet.setdefault(user, {})[movie] = rating
                testSet_len += 1
        # user, user_add_testdata = self.user_add_dataset()
        # self.testSet[user].update(user_add_testdata)
        print('Split trainingSet and testSet success!')
        print('TrainSet = %s' % trainSet_len)
        print('TestSet = %s' % testSet_len)

    def load_file(self, filename):
        """Yield every line of ``filename`` except the first (header) line.

        Lines are yielded with trailing ``\\r``/``\\n`` characters removed.
        The file is decoded as UTF-8 and undecodable bytes are dropped, the
        same way ``show`` reads the movies file.
        """
        with open(filename, 'r', encoding='utf-8', errors='ignore') as f:
            next(f, None)  # skip the header line
            for line in f:
                yield line.strip('\r\n')
        print('Load %s success!' % filename)

    def calc_movie_sim(self):
        """Build the normalised movie-to-movie similarity matrix.

        Steps:
          1. Count how many training users rated each movie.
          2. Accumulate co-occurrence weights for every ordered pair of
             movies rated by the same user. Each user contributes
             ``1 / log(1 + number_of_movies_rated)`` (the ItemCF-IUF
             variant, which penalises very active users).
          3. Divide each weight by ``sqrt(popular[m1] * popular[m2])``.
          4. Divide every entry by the largest entry so the maximum is 1.

        The popularity table and the matrix are rebuilt from scratch on each
        call, so calling this method again gives the same result instead of
        accumulating on top of the previous one.
        """
        popular = {}
        sim = {}

        # Step 1: popularity of each movie.
        for movies in self.trainSet.values():
            for movie in movies:
                popular[movie] = popular.get(movie, 0) + 1
        self.movie_popular = popular
        self.movie_count = len(popular)
        print("Total movie number = %d" % self.movie_count)

        # Step 2: weighted co-occurrence counts.
        for movies in self.trainSet.values():
            if len(movies) < 2:
                # A user with fewer than two movies produces no pairs.
                continue
            weight = 1 / math.log(1 + len(movies))
            for m1 in movies:
                row = sim.setdefault(m1, {})
                for m2 in movies:
                    if m1 != m2:
                        row[m2] = row.get(m2, 0) + weight
        print("Build co-rated users matrix success!")

        # Step 3: turn co-occurrence into similarity. Every movie in the
        # matrix came from the training set, so its popularity is >= 1 and
        # the denominator cannot be zero.
        print("Calculating movie similarity matrix ...")
        for m1, related_movies in sim.items():
            for m2, count in related_movies.items():
                related_movies[m2] = count / math.sqrt(popular[m1] * popular[m2])
        print('Calculate movie similarity matrix success!')

        # Step 4: global max normalisation.
        max_sim = max(
            (w for related_movies in sim.values() for w in related_movies.values()),
            default=0,
        )
        if max_sim > 0:
            for related_movies in sim.values():
                for m2 in related_movies:
                    related_movies[m2] = related_movies[m2] / max_sim
        self.movie_sim_matrix = sim

    def recommend(self, user):
        """Return the top-N recommendations for ``user``.

        For each movie the user rated in the training set, the K most similar
        movies are looked up; every one the user has not rated yet receives
        ``similarity * rating`` added to its score.

        Args:
            user: user id as stored in ``trainSet``.

        Returns:
            A list of at most ``n_rec_movie`` ``(movie, score)`` tuples sorted
            by descending score. The list is empty for an unknown user or
            when no candidate exists.
        """
        K = self.n_sim_movie
        N = self.n_rec_movie
        rank = {}
        watched_movies = self.trainSet.get(user, {})
        for movie, rating in watched_movies.items():
            # A movie that never co-occurred with another one has no row in
            # the similarity matrix; treat it as having no neighbours.
            neighbours = sorted(
                self.movie_sim_matrix.get(movie, {}).items(),
                key=itemgetter(1),
                reverse=True,
            )[:K]
            for related_movie, w in neighbours:
                if related_movie in watched_movies:
                    # Already rated by the user: not a candidate.
                    continue
                rank[related_movie] = rank.get(related_movie, 0) + w * float(rating)
        return sorted(rank.items(), key=itemgetter(1), reverse=True)[:N]

    def show(self, rec_movie, user, movies_file='recommend/movies.csv'):
        """Translate recommended movie ids into movie names.

        The movies file is read once with the ``csv`` module. Its first line
        is skipped as a header; in every other row the first field is the
        movie id, the last field is the detail link, and everything in
        between is the movie name (so names containing commas, quoted or
        not, are handled). Rows with fewer than three fields are ignored.

        Args:
            rec_movie: iterable of ``(movie_id, score)`` pairs.
            user: key under which the result is stored.
            movies_file: path of the movies CSV file.

        Returns:
            ``{user: {movie_name: score}}`` in the order of ``rec_movie``;
            ids that are not found in the movies file are left out.
        """
        rec_movie = list(rec_movie)
        wanted = {movie_id for movie_id, _ in rec_movie}
        names = {}
        if wanted:
            with open(movies_file, 'r', encoding='utf-8', errors='ignore',
                      newline='') as f:
                reader = csv.reader(f)
                next(reader, None)  # skip the header line
                for row in reader:
                    if len(row) < 3:
                        continue
                    movie_id = row[0]
                    # Keep the first matching row for each id.
                    if movie_id in wanted and movie_id not in names:
                        names[movie_id] = ','.join(row[1:-1])
                        if len(names) == len(wanted):
                            break

        rec_result = {user: {}}
        for movie_id, rec in rec_movie:
            if movie_id in names:
                rec_result[user][names[movie_id]] = rec
        return rec_result

    def evaluate(self, target_user='0'):
        """Print precision/recall/coverage and return one user's recommendations.

        Recommendations are generated for every user in the training set and
        compared with that user's test-set movies. A metric whose denominator
        is zero (e.g. empty train or test set) is reported as 0.0.

        Args:
            target_user: id of the user whose recommendations are printed and
                returned.

        Returns:
            ``{movie_name: score}`` for ``target_user``, or an empty dict when
            that user is not in the training set.
        """
        print('Evaluating start ...')
        N = self.n_rec_movie
        # Counters for precision and recall.
        hit = 0
        rec_count = 0
        test_count = 0
        # Distinct recommended movies, for coverage.
        all_rec_movies = set()
        target_recs = None

        for user in self.trainSet:
            test_movies = self.testSet.get(user, {})
            rec_movies = self.recommend(user)
            for movie, _ in rec_movies:
                if movie in test_movies:
                    hit += 1
                all_rec_movies.add(movie)
            # The nominal list length N is counted even when fewer movies
            # were actually recommended.
            rec_count += N
            test_count += len(test_movies)
            if user == target_user:
                target_recs = rec_movies

        precision = hit / (1.0 * rec_count) if rec_count else 0.0
        recall = hit / (1.0 * test_count) if test_count else 0.0
        coverage = (len(all_rec_movies) / (1.0 * self.movie_count)
                    if self.movie_count else 0.0)
        print('precisioin=%.4f\trecall=%.4f\tcoverage=%.4f' % (precision, recall, coverage))

        # Only the target user's titles are needed, so the movies file is
        # consulted for that user alone.
        result = {}
        if target_recs is not None:
            result = self.show(target_recs, target_user)[target_user]
        print(result)
        return result

if __name__ == '__main__':
    # Script entry point: load and split the ratings, build the movie
    # similarity matrix, then evaluate and keep the returned recommendations.
    recommender = ItemBasedCF()
    recommender.get_dataset()
    recommender.calc_movie_sim()
    result = recommender.evaluate()










